/*
 * Copyright (c) 2004-2006 Voltaire, Inc. All rights reserved.
 * Copyright (c) 2002-2006 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: osm_ucast_mgr.c 9494 2006-09-15 19:11:21Z halr $
 */


/*
 * Abstract:
 *    Implementation of osm_ucast_mgr_t.
 * This file implements the Unicast Manager object.
 *
 * Environment:
 *    Linux User Mode
 *
 * $Revision: 1.14 $
 */

#if HAVE_CONFIG_H
#  include <config.h>
#endif /* HAVE_CONFIG_H */

#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iba/ib_types.h>
#include <complib/cl_qmap.h>
#include <complib/cl_debug.h>
#include <opensm/osm_ucast_mgr.h>
#include <opensm/osm_log.h>
#include <opensm/osm_node.h>
#include <opensm/osm_switch.h>
#include <opensm/osm_helper.h>
#include <opensm/osm_msgdef.h>
#include <opensm/osm_opensm.h>

/*
 * Line length constant.
 * NOTE(review): not referenced in the visible part of this file; the dump
 * routines size their scratch buffers with OSM_REPORT_LINE_SIZE instead.
 * Confirm whether it is still needed.
 */
#define LINE_LENGTH 256

/**********************************************************************
 **********************************************************************/
/*
 * This flag is used for stopping the relaxation algorithm if no
 * change is detected during the fabric scan.
 * __osm_ucast_mgr_process_neighbor() sets it to TRUE each time it finds
 * a shorter hop count for some LID.
 */
static boolean_t __some_hop_count_set;

/**********************************************************************
 **********************************************************************/
/*
 * Put a unicast manager object into a known state by zero-filling the
 * entire structure.
 *
 * p_mgr: object to clear; it is written through unconditionally.
 */
void
osm_ucast_mgr_construct(
  IN osm_ucast_mgr_t* const p_mgr )
{
  memset( p_mgr, 0, sizeof *p_mgr );
}

/**********************************************************************
 **********************************************************************/
/*
 * Tear down a unicast manager object.  Nothing is released here: apart
 * from the assertion, only the OSM_LOG_ENTER / OSM_LOG_EXIT tracing
 * macros are invoked on the manager's log.
 *
 * p_mgr: manager to destroy; must not be NULL.
 */
void
osm_ucast_mgr_destroy(
  IN osm_ucast_mgr_t* const p_mgr )
{
  CL_ASSERT( p_mgr );

  OSM_LOG_ENTER( p_mgr->p_log, osm_ucast_mgr_destroy );
  OSM_LOG_EXIT( p_mgr->p_log );
}

/**********************************************************************
 **********************************************************************/
/*
 * Initialize a unicast manager: zero the object, then store the request
 * object, subnet, report buffer, log and lock pointers in it.
 *
 * p_mgr:        manager object to initialize.
 * p_req:        request object; asserted non-NULL.
 * p_subn:       subnet object; asserted non-NULL.
 * p_report_buf: text buffer later used by the dump routines.
 * p_log:        log object used for tracing.
 * p_lock:       lock pointer; asserted non-NULL.
 *
 * Returns IB_SUCCESS; no failure path exists in this function.
 */
ib_api_status_t
osm_ucast_mgr_init(
  IN osm_ucast_mgr_t* const p_mgr,
  IN osm_req_t* const p_req,
  IN osm_subn_t* const p_subn,
  IN char* const p_report_buf,
  IN osm_log_t* const p_log,
  IN cl_plock_t* const p_lock )
{
  OSM_LOG_ENTER( p_log, osm_ucast_mgr_init );

  CL_ASSERT( p_req );
  CL_ASSERT( p_subn );
  CL_ASSERT( p_lock );

  /* Start from an all-zero object before filling in the references. */
  osm_ucast_mgr_construct( p_mgr );

  p_mgr->p_req = p_req;
  p_mgr->p_subn = p_subn;
  p_mgr->p_report_buf = p_report_buf;
  p_mgr->p_log = p_log;
  p_mgr->p_lock = p_lock;

  OSM_LOG_EXIT( p_log );
  return( IB_SUCCESS );
}

/**********************************************************************
 **********************************************************************/
/*
 * Format a per-port "path count through port" report for one switch into
 * the manager's report buffer and pass it to osm_log_raw() at
 * OSM_LOG_ROUTING verbosity.
 *
 * Port 0 is labelled as the switch management port.  For every other
 * port that has a remote node, the remote node's type and GUID are
 * appended to that port's line.
 *
 * p_mgr: manager supplying the log and the report buffer.
 * p_sw:  switch whose ports are reported.
 *
 * NOTE(review): all appends to p_report_buf are unbounded; the buffer is
 * assumed large enough for every port line - confirm against its size.
 */
static void
__osm_ucast_mgr_dump_path_distribution(
  IN const osm_ucast_mgr_t* const p_mgr,
  IN const osm_switch_t* const p_sw )
{
  osm_node_t *p_sw_node;
  osm_node_t *p_peer_node;
  const char *peer_label;
  uint8_t port;
  uint8_t port_count;
  uint32_t path_count;
  uint64_t peer_guid_ho;
  char scratch[OSM_REPORT_LINE_SIZE];

  OSM_LOG_ENTER( p_mgr->p_log, __osm_ucast_mgr_dump_path_distribution );

  p_sw_node = osm_switch_get_node_ptr( p_sw );
  port_count = osm_switch_get_num_ports( p_sw );

  /* The header line starts the report; it overwrites any old content. */
  sprintf( p_mgr->p_report_buf, "__osm_ucast_mgr_dump_path_distribution: "
           "Switch 0x%" PRIx64 "\n"
           "Port : Path Count Through Port",
           cl_ntoh64( osm_node_get_node_guid( p_sw_node ) ) );

  for( port = 0; port < port_count; port++ )
  {
    path_count = osm_switch_path_count_get( p_sw, port );
    sprintf( scratch, "\n %03u : %u", port, path_count );
    strcat( p_mgr->p_report_buf, scratch );

    if( port == 0 )
    {
      strcat( p_mgr->p_report_buf, " (switch management port)" );
      continue;
    }

    /* Ports with nothing on the far end get the bare count only. */
    p_peer_node = osm_node_get_remote_node( p_sw_node, port, NULL );
    if( p_peer_node == NULL )
      continue;

    peer_guid_ho = cl_ntoh64( osm_node_get_node_guid( p_peer_node ) );

    switch( osm_node_get_remote_type( p_sw_node, port ) )
    {
    case IB_NODE_TYPE_SWITCH:
      peer_label = " (link to switch";
      break;
    case IB_NODE_TYPE_ROUTER:
      peer_label = " (link to router";
      break;
    case IB_NODE_TYPE_CA:
      peer_label = " (link to CA";
      break;
    default:
      peer_label = " (link to unknown node type";
      break;
    }
    strcat( p_mgr->p_report_buf, peer_label );

    sprintf( scratch, " 0x%" PRIx64 ")", peer_guid_ho );
    strcat( p_mgr->p_report_buf, scratch );
  }

  strcat( p_mgr->p_report_buf, "\n" );

  osm_log_raw( p_mgr->p_log, OSM_LOG_ROUTING, p_mgr->p_report_buf );

  OSM_LOG_EXIT( p_mgr->p_log );
}

/**********************************************************************
 **********************************************************************/
/*
 * Write the unicast routing table of one switch to p_fdbFile.
 *
 * For every LID from 1 up to the switch's maximum LID one row is
 * produced: either "UNREACHABLE", or the output port, its hop count and
 * whether that hop count equals the least hop count known for the LID.
 * Rows are accumulated in the manager's report buffer and written to the
 * file each time OSM_REPORT_BUF_THRESHOLD lines have been collected; a
 * fresh header line is emitted at the start of every such batch.
 *
 * The flush check runs after EVERY row, including UNREACHABLE rows.
 * Skipping it for unreachable LIDs would let a long run of them keep
 * appending to the report buffer without ever flushing it.
 *
 * p_mgr:     manager supplying the log, subnet and report buffer.
 * p_sw:      switch whose routes are dumped.
 * p_fdbFile: open output stream.
 */
static void
__osm_ucast_mgr_dump_ucast_routes(
  IN const osm_ucast_mgr_t*   const p_mgr,
  IN const osm_switch_t*      const p_sw,
  IN FILE *p_fdbFile )
{
  const osm_node_t*        p_node;
  uint8_t                  port_num;
  uint8_t                  num_hops;
  uint8_t                  best_hops;
  uint8_t                  best_port;
  uint16_t                 max_lid_ho;
  uint16_t                 lid_ho;
  char                     line[OSM_REPORT_LINE_SIZE];
  uint32_t                 line_num = 0;
  boolean_t                ui_ucast_fdb_assign_func_defined;

  OSM_LOG_ENTER( p_mgr->p_log, __osm_ucast_mgr_dump_ucast_routes );

  p_node = osm_switch_get_node_ptr( p_sw );

  max_lid_ho = osm_switch_get_max_lid_ho( p_sw );

  for( lid_ho = 1; lid_ho <= max_lid_ho; lid_ho++ )
  {
    /* Start of a batch: (re)initialize the buffer with the header. */
    if( line_num == 0 )
    {
      sprintf( p_mgr->p_report_buf, "__osm_ucast_mgr_dump_ucast_routes: "
               "Switch 0x%016" PRIx64 "\n"
               "LID    : Port : Hops : Optimal\n",
               cl_ntoh64( osm_node_get_node_guid( p_node ) ) );
      line_num++;
    }

    /*
      A LID with no port may occur if there are 'holes' in the
      existing LID assignments.  Running SM with --reassign_lids
      will reassign and compress the LID range.  The subnet should
      work fine either way.

      Switches can also lie about which port routes a given lid due
      to a recent reconfiguration of the subnet.  Therefore, ensure
      that the hop count is better than OSM_NO_PATH as well.
    */
    port_num = osm_switch_get_port_by_lid( p_sw, lid_ho );
    num_hops = OSM_NO_PATH;
    if( port_num != OSM_NO_PATH )
      num_hops = osm_switch_get_hop_count( p_sw, lid_ho, port_num );

    if( num_hops == OSM_NO_PATH )
    {
      snprintf( line, sizeof(line), "0x%04X : UNREACHABLE\n", lid_ho );
      strcat( p_mgr->p_report_buf, line );
    }
    else
    {
      best_hops = osm_switch_get_least_hops( p_sw, lid_ho );
      snprintf( line, sizeof(line), "0x%04X : %03u  : %02u   : ",
                lid_ho, port_num, num_hops );
      strcat( p_mgr->p_report_buf, line );

      if( best_hops == num_hops )
        strcat( p_mgr->p_report_buf, "yes" );
      else
      {
        ui_ucast_fdb_assign_func_defined =
          p_mgr->p_subn->p_osm->routing_engine.ucast_fdb_assign ?
          TRUE : FALSE;

        best_port = osm_switch_recommend_path(
          p_sw, lid_ho, TRUE,
          NULL, NULL, NULL, NULL, /* No LMC Optimization */
          ui_ucast_fdb_assign_func_defined );
        snprintf( line, sizeof(line),
                  "No %u hop path possible via port %u!",
                  best_hops, best_port );
        strcat( p_mgr->p_report_buf, line );
      }

      strcat( p_mgr->p_report_buf, "\n" );
    }

    /* Flush after any kind of row so the buffer cannot keep growing. */
    if( ++line_num >= OSM_REPORT_BUF_THRESHOLD )
    {
      fprintf(p_fdbFile,"%s",p_mgr->p_report_buf );
      line_num = 0;
    }
  }

  /* Write whatever is left of the last, partial batch. */
  if( line_num != 0 )
    fprintf(p_fdbFile,"%s\n",p_mgr->p_report_buf );

  OSM_LOG_EXIT( p_mgr->p_log );
}

/**********************************************************************
 **********************************************************************/
/*
 * Context handed to __osm_ucast_mgr_dump_table() for each switch visited
 * by __osm_ucast_mgr_dump_tables().
 */
struct ucast_mgr_dump_context {
	osm_ucast_mgr_t *p_mgr;	/* unicast manager performing the dump */
	FILE *file;		/* open stream the routes are written to */
};

/*
 * Per-switch callback used by __osm_ucast_mgr_dump_tables().
 *
 * Always writes the switch's unicast routes to the context's file; when
 * debug logging is active the path distribution is dumped first.
 *
 * p_map_item: map item of the switch (cast to osm_switch_t).
 * context:    pointer to a struct ucast_mgr_dump_context.
 */
static void
__osm_ucast_mgr_dump_table(
  IN cl_map_item_t* const  p_map_item,
  IN void* context )
{
  struct ucast_mgr_dump_context* const p_dump =
    (struct ucast_mgr_dump_context*)context;
  osm_ucast_mgr_t* const p_mgr = p_dump->p_mgr;
  osm_switch_t* const p_sw = (osm_switch_t*)p_map_item;

  if( osm_log_is_active( p_mgr->p_log, OSM_LOG_DEBUG ) )
  {
    __osm_ucast_mgr_dump_path_distribution( p_mgr, p_sw );
  }

  __osm_ucast_mgr_dump_ucast_routes( p_mgr, p_sw, p_dump->file );
}

/*
 * Dump the unicast forwarding information of every switch in the subnet
 * to the file "osm.fdbs" inside the configured dump files directory.
 *
 * The path is built with snprintf() so that it is always NUL-terminated.
 * If the directory name is too long for the local path buffer, the
 * condition is logged and nothing is dumped; a truncated path is never
 * opened.  A failure to open the file is logged and the dump is skipped.
 *
 * p_mgr: manager supplying the subnet, options and log.
 */
static void __osm_ucast_mgr_dump_tables(
  IN osm_ucast_mgr_t *p_mgr )
{
  char file_name[1024];
  struct ucast_mgr_dump_context dump_context;
  FILE  *file;
  int   path_len;

  path_len = snprintf( file_name, sizeof(file_name), "%s/osm.fdbs",
                       p_mgr->p_subn->opt.dump_files_dir );
  if( path_len < 0 || (size_t)path_len >= sizeof(file_name) )
  {
    osm_log( p_mgr->p_log, OSM_LOG_ERROR,
             "__osm_ucast_mgr_dump_tables: ERR 3A12: "
             "Failed to open fdb file: path too long (%s/osm.fdbs)\n",
             p_mgr->p_subn->opt.dump_files_dir );
    return;
  }

  file = fopen(file_name, "w");
  if (!file)
  {
    osm_log( p_mgr->p_log, OSM_LOG_ERROR,
             "__osm_ucast_mgr_dump_tables: ERR 3A12: "
             "Failed to open fdb file (%s)\n",
             file_name );
    return;
  }

  dump_context.p_mgr = p_mgr;
  dump_context.file = file;

  /* Visit every switch; the callback writes its tables to the file. */
  cl_qmap_apply_func( &p_mgr->p_subn->sw_guid_tbl,
                      __osm_ucast_mgr_dump_table, &dump_context );

  fclose(file);
}

/**********************************************************************
 Add each switch's own LID(s) to its LID matrix.
**********************************************************************/
/*
 * Per-switch callback: notify the switch that a path rebuild is starting,
 * then record a hop count of 0 on port 0 for each of the switch's own
 * LIDs.
 *
 * The LID range starts at the base LID of port 0.  It spans 2^LMC LIDs
 * when osm_switch_is_sp0_enhanced() reports true for the switch, and a
 * single LID otherwise.
 *
 * p_map_item: map item of the switch (cast to osm_switch_t).
 * context:    the unicast manager (osm_ucast_mgr_t).
 */
static void
__osm_ucast_mgr_process_hop_0(
  IN cl_map_item_t* const  p_map_item,
  IN void* context )
{
  osm_switch_t* const p_sw = (osm_switch_t*)p_map_item;
  osm_ucast_mgr_t* const p_mgr = (osm_ucast_mgr_t*)context;
  osm_node_t *p_node;
  uint16_t first_lid_ho;
  uint16_t last_lid_ho;
  uint16_t lid_ho;
  uint8_t lmc;
  cl_status_t status;

  OSM_LOG_ENTER( p_mgr->p_log, __osm_ucast_mgr_process_hop_0 );

  p_node = p_sw->p_node;

  CL_ASSERT( p_node );
  CL_ASSERT( osm_node_get_type( p_node ) == IB_NODE_TYPE_SWITCH );

  /*
    Starting a rebuild, so notify the switch so it can
    clear tables, etc...
  */
  osm_switch_prepare_path_rebuild( p_sw );

  first_lid_ho = cl_ntoh16( osm_node_get_base_lid( p_node, 0 ) );
  lmc = osm_switch_is_sp0_enhanced( p_sw ) ?
    osm_node_get_lmc( p_node, 0 ) : 0;
  last_lid_ho = (uint16_t)( first_lid_ho + (1 << lmc) - 1 );

  for( lid_ho = first_lid_ho; lid_ho <= last_lid_ho; lid_ho++ )
  {
    if( osm_log_is_active( p_mgr->p_log, OSM_LOG_DEBUG ) )
    {
      osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
               "__osm_ucast_mgr_process_hop_0: "
               "Processing switch GUID 0x%" PRIx64 ", LID 0x%X\n",
               cl_ntoh64( osm_node_get_node_guid( p_node ) ),
               lid_ho );
    }

    status = osm_switch_set_hops( p_sw, lid_ho, 0, 0 );
    if( status == CL_SUCCESS )
      continue;

    /* Failure is only reported; the remaining LIDs are still tried. */
    osm_log( p_mgr->p_log, OSM_LOG_ERROR,
             "__osm_ucast_mgr_process_hop_0: ERR 3A02: "
             "Setting hop count failed (%s) for "
             "switch GUID 0x%" PRIx64 ", LID 0x%X\n",
             CL_STATUS_MSG( status ),
             cl_ntoh64( osm_node_get_node_guid( p_node ) ),
             lid_ho );
  }

  OSM_LOG_EXIT( p_mgr->p_log );
}

/**********************************************************************
 **********************************************************************/
/*
 * Propagate hop counts from one neighbor switch into the local switch.
 *
 * For every LID from 1 to the neighbor's maximum LID that the neighbor
 * can reach, the neighbor's least hop count plus one is compared with
 * the hop count currently stored locally for port_num.  When the new
 * value is smaller it is stored, and the file-scope flag
 * __some_hop_count_set is raised to record that something changed.
 *
 * p_mgr:           manager supplying the log.
 * p_sw:            local switch being updated.
 * p_remote_sw:     neighbor switch providing hop counts.
 * port_num:        local port leading to the neighbor; asserted non-zero.
 * remote_port_num: neighbor's port on this link; asserted non-zero and
 *                  otherwise only used for logging.
 */
static void
__osm_ucast_mgr_process_neighbor(
  IN osm_ucast_mgr_t* const p_mgr,
  IN osm_switch_t* const p_sw,
  IN osm_switch_t* const p_remote_sw,
  IN const uint8_t port_num,
  IN const uint8_t remote_port_num )
{
  osm_node_t* p_node;
  const osm_node_t* p_remote_node;
  uint16_t top_lid_ho;
  uint16_t lid_ho;
  uint8_t hops;
  cl_status_t status;

  OSM_LOG_ENTER( p_mgr->p_log, __osm_ucast_mgr_process_neighbor );

  CL_ASSERT( p_sw );
  CL_ASSERT( p_remote_sw );
  CL_ASSERT( port_num );
  CL_ASSERT( remote_port_num );

  p_node = osm_switch_get_node_ptr( p_sw );
  p_remote_node = osm_switch_get_node_ptr( p_remote_sw );

  CL_ASSERT( p_node );
  CL_ASSERT( p_remote_node );

  CL_ASSERT( osm_node_get_type( p_node ) == IB_NODE_TYPE_SWITCH );
  CL_ASSERT( osm_node_get_type( p_remote_node ) == IB_NODE_TYPE_SWITCH );

  if( osm_log_is_active( p_mgr->p_log, OSM_LOG_DEBUG ) )
  {
    osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
             "__osm_ucast_mgr_process_neighbor: "
             "Node 0x%" PRIx64 ", remote node 0x%" PRIx64 "\n"
             "\t\t\t\tport 0x%X, remote port 0x%X\n",
             cl_ntoh64( osm_node_get_node_guid( p_node ) ),
             cl_ntoh64( osm_node_get_node_guid( p_remote_node ) ),
             port_num, remote_port_num );
  }

  /* Every LID known to the neighbor switch is examined. */
  top_lid_ho = osm_switch_get_max_lid_ho( p_remote_sw );

  /*
    Make sure the local lid matrix has enough room to hold
    all the LID info coming from the remote LID matrix.
  */
  osm_switch_set_min_lid_size( p_sw, top_lid_ho );

  for( lid_ho = 1; lid_ho <= top_lid_ho; lid_ho++ )
  {
    /* Lowest hop count from the neighbor to this LID. */
    hops = osm_switch_get_least_hops( p_remote_sw, lid_ho );
    if( hops == OSM_NO_PATH )
      continue;

    /* One extra hop is needed to get to the neighbor itself. */
    hops++;

    CL_ASSERT( hops <= osm_switch_get_hop_count( p_sw, lid_ho,
                                                 port_num ) );

    /* Nothing to do unless this is a strict improvement. */
    if( osm_switch_get_hop_count( p_sw, lid_ho, port_num ) <= hops )
      continue;

    if( osm_log_is_active( p_mgr->p_log, OSM_LOG_DEBUG ) )
    {
      osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
               "__osm_ucast_mgr_process_neighbor: "
               "New best path is %u hops for LID 0x%X\n",
               hops, lid_ho );
    }

    /* Record that this pass changed something. */
    __some_hop_count_set = TRUE;

    status = osm_switch_set_hops( p_sw, lid_ho, port_num, hops );
    if( status != CL_SUCCESS )
    {
      osm_log( p_mgr->p_log, OSM_LOG_ERROR,
               "__osm_ucast_mgr_process_neighbor: ERR 3A03: "
               "Setting hop count failed (%s)\n",
               CL_STATUS_MSG( status ) );
    }
  }

  OSM_LOG_EXIT( p_mgr->p_log );
}

/**********************************************************************
 **********************************************************************/
/*
 * Record a hop count of 1, through port_num, for every LID of the CA or
 * router attached to that port of the switch.
 *
 * The LID range is taken from the remote port's base LID and LMC.  Any
 * other remote node type is reported as error 3A01 and ignored.
 * NOTE(review): that error prints the remote node's type together with
 * the GUID of p_node (the local node) - confirm this is intended.
 *
 * p_mgr:           manager supplying the log.
 * p_sw:            switch whose LID matrix is updated.
 * p_node:          node object of the switch; asserted non-NULL.
 * port_num:        switch port the leaf hangs off; asserted non-zero.
 * p_remote_node:   the attached node; asserted non-NULL.
 * remote_port_num: port number on the attached node; asserted non-zero.
 */
static void
__osm_ucast_mgr_process_leaf(
  IN osm_ucast_mgr_t* const p_mgr,
  IN osm_switch_t* const p_sw,
  IN osm_node_t* const p_node,
  IN const uint8_t port_num,
  IN osm_node_t* const p_remote_node,
  IN const uint8_t remote_port_num )
{
  uint16_t lid_ho;
  uint16_t first_lid_ho;
  uint16_t last_lid_ho;
  uint8_t lmc;

  OSM_LOG_ENTER( p_mgr->p_log, __osm_ucast_mgr_process_leaf );

  CL_ASSERT( p_node );
  CL_ASSERT( p_remote_node );
  CL_ASSERT( port_num );
  CL_ASSERT( remote_port_num );

  if( osm_node_get_type( p_remote_node ) != IB_NODE_TYPE_CA &&
      osm_node_get_type( p_remote_node ) != IB_NODE_TYPE_ROUTER )
  {
    /* Only CAs and routers are treated as leaves here. */
    osm_log( p_mgr->p_log, OSM_LOG_ERROR,
             "__osm_ucast_mgr_process_leaf: ERR 3A01: "
             "Bad node type %u, GUID = 0x%" PRIx64 "\n",
             osm_node_get_type( p_remote_node ),
             cl_ntoh64( osm_node_get_node_guid( p_node ) ));
  }
  else
  {
    first_lid_ho = cl_ntoh16( osm_node_get_base_lid(
                                p_remote_node, remote_port_num ) );
    lmc = osm_node_get_lmc( p_remote_node, remote_port_num );
    last_lid_ho = (uint16_t)( first_lid_ho + (1 << lmc) - 1 );

    if( osm_log_is_active( p_mgr->p_log, OSM_LOG_DEBUG ) )
    {
      osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
               "__osm_ucast_mgr_process_leaf: "
               "Discovered LIDs [0x%X,0x%X]\n"
               "\t\t\t\tport number 0x%X, node 0x%" PRIx64 "\n",
               first_lid_ho, last_lid_ho,
               port_num, cl_ntoh64( osm_node_get_node_guid( p_node ) ));
    }

    /* Each LID of the leaf is one hop away through this port. */
    for( lid_ho = first_lid_ho; lid_ho <= last_lid_ho; lid_ho++ )
      osm_switch_set_hops( p_sw, lid_ho, port_num, 1 );
  }

  OSM_LOG_EXIT( p_mgr->p_log );
}

/**********************************************************************
 **********************************************************************/
/*
 * Per-switch callback: add the LIDs of all non-switch nodes attached to
 * this switch to its LID matrix.
 *
 * Ports are scanned starting at 1, which skips the switch's management
 * port.  A port is processed only when it has a remote node, that node
 * is not the switch itself (loopback), and the node is not a switch;
 * neighbor switches are not handled here.
 *
 * p_map_item: map item of the switch (cast to osm_switch_t).
 * context:    the unicast manager (osm_ucast_mgr_t).
 */
static void
__osm_ucast_mgr_process_leaves(
  IN cl_map_item_t* const  p_map_item,
  IN void* context )
{
  osm_switch_t* const p_sw = (osm_switch_t*)p_map_item;
  osm_ucast_mgr_t* const p_mgr = (osm_ucast_mgr_t*)context;
  osm_node_t *p_node;
  osm_node_t *p_peer;
  uint32_t port;
  uint32_t port_count;
  uint8_t peer_port;

  OSM_LOG_ENTER( p_mgr->p_log, __osm_ucast_mgr_process_leaves );

  p_node = p_sw->p_node;

  CL_ASSERT( p_node );
  CL_ASSERT( osm_node_get_type( p_node ) == IB_NODE_TYPE_SWITCH );

  if( osm_log_is_active( p_mgr->p_log, OSM_LOG_DEBUG ) )
  {
    osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
             "__osm_ucast_mgr_process_leaves: "
             "Processing switch 0x%" PRIx64 "\n",
             cl_ntoh64( osm_node_get_node_guid( p_node ) ));
  }

  port_count = osm_node_get_num_physp( p_node );

  for( port = 1; port < port_count; port++ )
  {
    p_peer = osm_node_get_remote_node( p_node, (uint8_t)port, &peer_port );

    /* Skip unconnected ports and loopback links to this same switch. */
    if( p_peer == NULL || p_peer == p_node )
      continue;

    /* Neighbor switches are not leaves. */
    if( osm_node_get_type( p_peer ) == IB_NODE_TYPE_SWITCH )
      continue;

    __osm_ucast_mgr_process_leaf( p_mgr, p_sw, p_node,
                                  (uint8_t)port, p_peer, peer_port );
  }

  OSM_LOG_EXIT( p_mgr->p_log );
}

/**********************************************************************
 **********************************************************************/
/*
 * Choose, for one switch, the output port for every LID that belongs to
 * one subnet port, and record the choice in the switch's forwarding
 * information via osm_switch_set_path().
 *
 * When the configured subnet LMC yields more than one LID per port, two
 * zero-initialized scratch arrays (remote system GUIDs and remote node
 * GUIDs) and their usage counters are passed to
 * osm_switch_recommend_path(); otherwise NULL is passed for all four.
 *
 * A LID with no path is still written with OSM_NO_PATH and marked as
 * ignored for port profiling.  That case is logged as error 3A08 and
 * flags subnet_initialization_error only when no routing-engine
 * ucast_fdb_assign function is set; with one set it is logged at debug
 * level only.
 *
 * p_mgr:  manager supplying the log, subnet and options.
 * p_sw:   switch whose paths are being set.
 * p_port: subnet port whose LID range is routed.
 */
static void
__osm_ucast_mgr_process_port(
  IN osm_ucast_mgr_t* const p_mgr,
  IN osm_switch_t* const p_sw,
  IN const osm_port_t* const p_port )
{
  uint16_t min_lid_ho;
  uint16_t max_lid_ho;
  uint16_t lid_ho;
  uint8_t port;
  boolean_t ignore_existing;
  boolean_t is_ignored_by_port_prof;
  boolean_t ui_ucast_fdb_assign_func_defined;
  ib_net64_t node_guid;
  /*
    The following are temporary structures that will aid
    in providing better routing in LMC > 0 situations
  */
  uint16_t lids_per_port = 1 << p_mgr->p_subn->opt.lmc;
  uint64_t *remote_sys_guids = NULL;
  uint64_t *remote_node_guids = NULL;
  uint16_t num_used_sys = 0;
  uint16_t num_used_nodes = 0;
  /* What is actually handed to osm_switch_recommend_path(). */
  uint64_t *p_sys_hint = NULL;
  uint64_t *p_node_hint = NULL;
  uint16_t *p_sys_count = NULL;
  uint16_t *p_node_count = NULL;

  OSM_LOG_ENTER( p_mgr->p_log, __osm_ucast_mgr_process_port );

  remote_sys_guids = calloc( lids_per_port, sizeof(uint64_t) );
  if( !remote_sys_guids )
  {
    osm_log( p_mgr->p_log, OSM_LOG_ERROR,
             "__osm_ucast_mgr_process_port: ERR 3A09: "
             "Cannot allocate array. Insufficient memory\n");
    goto Exit;
  }

  remote_node_guids = calloc( lids_per_port, sizeof(uint64_t) );
  if( !remote_node_guids )
  {
    osm_log( p_mgr->p_log, OSM_LOG_ERROR,
             "__osm_ucast_mgr_process_port: ERR 3A0A: "
             "Cannot allocate array. Insufficient memory\n");
    goto Exit;
  }

  osm_port_get_lid_range_ho( p_port, &min_lid_ho, &max_lid_ho );

  /* A zero LID means the port was never initialized properly; skip it. */
  if( min_lid_ho == 0 || max_lid_ho == 0 )
  {
    osm_log( p_mgr->p_log, OSM_LOG_ERROR,
             "__osm_ucast_mgr_process_port: ERR 3A04: "
             "Port 0x%" PRIx64 " has LID 0. An initialization "
             "error occurred. Ignoring port\n",
             cl_ntoh64( osm_port_get_guid( p_port ) ) );
    goto Exit;
  }

  if( osm_log_is_active( p_mgr->p_log, OSM_LOG_DEBUG ) )
  {
    osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
             "__osm_ucast_mgr_process_port: "
             "Processing port 0x%" PRIx64
             ", LIDs [0x%X,0x%X]\n",
             cl_ntoh64( osm_port_get_guid( p_port ) ),
             min_lid_ho, max_lid_ho );
  }

  /*
    TO DO - This should be runtime error, not a CL_ASSERT()
  */
  CL_ASSERT( max_lid_ho < osm_switch_get_fwd_tbl_size( p_sw ) );

  node_guid = osm_node_get_node_guid( osm_switch_get_node_ptr( p_sw ) );

  /* Remember whether a ui ucast fdb assign function was given. */
  ui_ucast_fdb_assign_func_defined =
    p_mgr->p_subn->p_osm->routing_engine.ucast_fdb_assign ? TRUE : FALSE;

  /*
    If the user requested a complete subnet reconfiguration,
    then ignore existing paths when choosing paths now.
    Note that if there is a ui ucast fdb assign function - then
    ignore_existing should be false.
  */
  ignore_existing = p_mgr->p_subn->ignore_existing_lfts &&
    (!ui_ucast_fdb_assign_func_defined);

  /* Use the enhanced algorithm (with hint arrays) only for LMC > 0. */
  if( lids_per_port > 1 )
  {
    p_sys_hint = remote_sys_guids;
    p_sys_count = &num_used_sys;
    p_node_hint = remote_node_guids;
    p_node_count = &num_used_nodes;
  }

  /*
    The lid matrix contains the number of hops to each
    lid from each port.  From this information we determine
    how best to distribute the LID range across the ports
    that can reach those LIDs.
  */
  for( lid_ho = min_lid_ho; lid_ho <= max_lid_ho; lid_ho++ )
  {
    port = osm_switch_recommend_path( p_sw, lid_ho, ignore_existing,
                                      p_sys_hint, p_sys_count,
                                      p_node_hint, p_node_count,
                                      ui_ucast_fdb_assign_func_defined );

    if( port != OSM_NO_PATH )
    {
      osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
               "__osm_ucast_mgr_process_port: "
               "Routing LID 0x%X to port 0x%X"
               "\n\t\t\t\tFor switch 0x%" PRIx64 "\n",
               lid_ho, port, cl_ntoh64( node_guid ) );

      /*
        we would like to optionally ignore this port in equalization
        like in the case of the Mellanox Anafa Internal PCI TCA port
      */
      is_ignored_by_port_prof =
        osm_port_prof_is_ignored_port( p_mgr->p_subn,
                                       cl_ntoh64( node_guid ), port );

      /*
        We also would ignore this route if the target lid is of a switch
        and the port_profile_switch_node is not TRUE
      */
      if( !p_mgr->p_subn->opt.port_profile_switch_nodes )
      {
        is_ignored_by_port_prof |=
          ( osm_node_get_type( osm_port_get_parent_node( p_port ) ) ==
            IB_NODE_TYPE_SWITCH );
      }
    }
    else if( ui_ucast_fdb_assign_func_defined )
    {
      /* do not try to overwrite the ppro of non existing port ... */
      is_ignored_by_port_prof = TRUE;

      /* Up/Down routing can cause unreachable routes between some
         switches so we do not report that as an error in that case */
      osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
               "__osm_ucast_mgr_process_port: "
               "No path to get to LID 0x%X from switch 0x%" PRIx64 "\n",
               lid_ho, cl_ntoh64( node_guid ) );
    }
    else
    {
      /* do not try to overwrite the ppro of non existing port ... */
      is_ignored_by_port_prof = TRUE;

      osm_log( p_mgr->p_log, OSM_LOG_ERROR,
               "__osm_ucast_mgr_process_port: ERR 3A08: "
               "No path to get to LID 0x%X from switch 0x%" PRIx64 "\n",
               lid_ho, cl_ntoh64( node_guid ) );
      /* trigger a new sweep - try again ... */
      p_mgr->p_subn->subnet_initialization_error = TRUE;
    }

    /*
      We have selected the port for this LID.
      Write it to the forwarding tables.
    */
    osm_switch_set_path( p_sw, lid_ho, port, is_ignored_by_port_prof );
  }

 Exit:
  /* free(NULL) is a no-op, so both can be released unconditionally. */
  free( remote_sys_guids );
  free( remote_node_guids );
  OSM_LOG_EXIT( p_mgr->p_log );
}

/**********************************************************************
 **********************************************************************/
/*
 * Send the forwarding configuration of one switch to the fabric.
 *
 * First a SwitchInfo Set is issued with lin_top set to the switch's
 * maximum LID and life_state rebuilt from the configured packet life
 * time.  Then every block returned by osm_switch_get_fwd_tbl_block() is
 * sent as a linear forwarding table Set, starting with block 0.
 * Failures of osm_req_set() are logged (3A06 / 3A05) and do not stop
 * the remaining requests.
 *
 * p_mgr: manager supplying the log, request object and subnet options.
 * p_sw:  switch to configure.
 */
static void
__osm_ucast_mgr_set_table(
  IN osm_ucast_mgr_t* const p_mgr,
  IN osm_switch_t* const p_sw )
{
  osm_node_t *p_node;
  osm_dr_path_t *p_path;
  osm_madw_context_t context;
  ib_api_status_t status;
  ib_switch_info_t si;
  ib_net64_t node_guid;
  uint32_t block_id_ho;
  uint8_t block[IB_SMP_DATA_SIZE];

  CL_ASSERT( p_mgr );

  OSM_LOG_ENTER( p_mgr->p_log, __osm_ucast_mgr_set_table );

  CL_ASSERT( p_sw );

  p_node = osm_switch_get_node_ptr( p_sw );

  CL_ASSERT( p_node );

  p_path = osm_node_get_any_dr_path_ptr( p_node );

  CL_ASSERT( p_path );

  node_guid = osm_node_get_node_guid( p_node );

  /*
    Work on a copy of the switch info and set the top of the
    unicast forwarding table in it.
  */
  si = *osm_switch_get_si_ptr( p_sw );
  si.lin_top = cl_hton16( osm_switch_get_max_lid_ho( p_sw ) );

  /* check to see if the change state bit is on. If it is - then we
     need to clear it. */
  if( ib_switch_info_get_state_change( &si ) )
  {
    si.life_state = ( (p_mgr->p_subn->opt.packet_life_time <<3 )
                      | ( si.life_state & IB_SWITCH_PSC ) )  & 0xfc;
  }
  else
  {
    si.life_state = (p_mgr->p_subn->opt.packet_life_time <<3 ) & 0xf8;
  }

  if( osm_log_is_active( p_mgr->p_log, OSM_LOG_DEBUG ) )
  {
    osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
             "__osm_ucast_mgr_set_table: "
             "Setting switch FT top to LID 0x%X\n",
             osm_switch_get_max_lid_ho( p_sw ) );
  }

  context.si_context.light_sweep = FALSE;
  context.si_context.node_guid = node_guid;
  context.si_context.set_method = TRUE;

  status = osm_req_set( p_mgr->p_req, p_path,
                        (uint8_t*)&si, sizeof(si),
                        IB_MAD_ATTR_SWITCH_INFO, 0,
                        CL_DISP_MSGID_NONE, &context );
  if( status != IB_SUCCESS )
  {
    osm_log( p_mgr->p_log, OSM_LOG_ERROR,
             "__osm_ucast_mgr_set_table: ERR 3A06: "
             "Sending SwitchInfo attribute failed (%s)\n",
             ib_get_err_str( status ) );
  }

  /*
    Send linear forwarding table blocks to the switch
    as long as the switch indicates it has blocks needing
    configuration.  The context is re-filled for LFT use only
    after the SwitchInfo request above has been issued.
  */
  context.lft_context.node_guid = node_guid;
  context.lft_context.set_method = TRUE;

  for( block_id_ho = 0;
       osm_switch_get_fwd_tbl_block( p_sw, block_id_ho, block );
       block_id_ho++ )
  {
    if( osm_log_is_active( p_mgr->p_log, OSM_LOG_DEBUG ) )
    {
      osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
               "__osm_ucast_mgr_set_table: "
               "Writing FT block %u\n", block_id_ho );
    }

    status = osm_req_set( p_mgr->p_req, p_path,
                          block, sizeof(block),
                          IB_MAD_ATTR_LIN_FWD_TBL,
                          cl_hton32( block_id_ho ),
                          CL_DISP_MSGID_NONE, &context );
    if( status != IB_SUCCESS )
    {
      osm_log( p_mgr->p_log, OSM_LOG_ERROR,
               "__osm_ucast_mgr_set_table: ERR 3A05: "
               "Sending linear fwd. tbl. block failed (%s)\n",
               ib_get_err_str( status ) );
    }
  }

  OSM_LOG_EXIT( p_mgr->p_log );
}

/**********************************************************************
 **********************************************************************/
/*
 * Per-switch callback: compute the route for every port in the subnet's
 * port GUID table on this switch, then send the resulting tables to the
 * switch with __osm_ucast_mgr_set_table().
 *
 * p_map_item: map item of the switch (cast to osm_switch_t).
 * context:    the unicast manager (osm_ucast_mgr_t).
 */
static void
__osm_ucast_mgr_process_tbl(
  IN cl_map_item_t* const  p_map_item,
  IN void* context )
{
  osm_switch_t* const p_sw = (osm_switch_t*)p_map_item;
  osm_ucast_mgr_t* const p_mgr = (osm_ucast_mgr_t*)context;
  osm_node_t *p_node;
  const osm_port_t *p_port;
  const cl_qmap_t* p_port_tbl;

  OSM_LOG_ENTER( p_mgr->p_log, __osm_ucast_mgr_process_tbl );

  p_node = p_sw->p_node;

  CL_ASSERT( p_node );
  CL_ASSERT( osm_node_get_type( p_node ) == IB_NODE_TYPE_SWITCH );

  if( osm_log_is_active( p_mgr->p_log, OSM_LOG_DEBUG ) )
  {
    osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
             "__osm_ucast_mgr_process_tbl: "
             "Processing switch 0x%" PRIx64 "\n",
             cl_ntoh64( osm_node_get_node_guid( p_node ) ));
  }

  /*
    Walk the whole port table, setting LID routes for each
    port based on base LID and LMC value.
  */
  p_port_tbl = &p_mgr->p_subn->port_guid_tbl;
  p_port = (osm_port_t*)cl_qmap_head( p_port_tbl );

  while( p_port != (osm_port_t*)cl_qmap_end( p_port_tbl ) )
  {
    __osm_ucast_mgr_process_port( p_mgr, p_sw, p_port );
    p_port = (osm_port_t*)cl_qmap_next( &p_port->map_item );
  }

  __osm_ucast_mgr_set_table( p_mgr, p_sw );

  OSM_LOG_EXIT( p_mgr->p_log );
}

/**********************************************************************
 **********************************************************************/
/*
 * Map-iteration adapter around __osm_ucast_mgr_set_table(): unpacks the
 * generic callback arguments and forwards them.
 *
 * p_map_item: map item of the switch (cast to osm_switch_t).
 * context:    the unicast manager (osm_ucast_mgr_t).
 */
static void
__osm_ucast_mgr_set_table_cb(
  IN cl_map_item_t* const  p_map_item,
  IN void* context )
{
  __osm_ucast_mgr_set_table( (osm_ucast_mgr_t*)context,
                             (osm_switch_t*)p_map_item );
}

/**********************************************************************
 **********************************************************************/
/*
 * cl_qmap apply callback: for the switch passed as p_map_item, visit
 * every external port and, where the port connects over a healthy link
 * to a different switch node, call __osm_ucast_mgr_process_neighbor()
 * for that (switch, remote switch, port, remote port) tuple.
 *
 * p_map_item - map item that is cast to the osm_switch_t to process.
 * context    - the osm_ucast_mgr_t driving the operation.
 */
static void
__osm_ucast_mgr_process_neighbors(
  IN cl_map_item_t* const  p_map_item,
  IN void* context )
{
  osm_ucast_mgr_t* const p_mgr = (osm_ucast_mgr_t*)context;
  osm_switch_t* const p_sw = (osm_switch_t*)p_map_item;
  cl_qmap_t *p_sw_tbl;
  osm_node_t *p_node;
  osm_node_t *p_remote_node;
  osm_switch_t *p_remote_sw;
  osm_physp_t* p_physp;
  ib_net64_t remote_node_guid;
  uint32_t num_ports;
  uint32_t port_num;
  uint8_t remote_port_num;

  OSM_LOG_ENTER( p_mgr->p_log, __osm_ucast_mgr_process_neighbors );

  p_node = p_sw->p_node;

  CL_ASSERT( p_node );
  CL_ASSERT( osm_node_get_type( p_node ) == IB_NODE_TYPE_SWITCH );

  if( osm_log_is_active( p_mgr->p_log, OSM_LOG_DEBUG ) )
  {
    osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
             "__osm_ucast_mgr_process_neighbors: "
             "Processing switch with GUID = 0x%" PRIx64 "\n",
             cl_ntoh64( osm_node_get_node_guid( p_node ) ) );
  }

  p_sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
  num_ports = osm_node_get_num_physp( p_node );

  /*
    Port 0 is the switch's management port, so begin at port 1.
  */
  for( port_num = 1; port_num < num_ports; port_num++ )
  {
    p_remote_node = osm_node_get_remote_node( p_node,
                                              (uint8_t)port_num,
                                              &remote_port_num );

    /* Nothing attached, or the port loops back to this same node. */
    if( !p_remote_node || p_remote_node == p_node )
      continue;

    /* Only switch-to-switch links are of interest here. */
    if( osm_node_get_type( p_remote_node ) != IB_NODE_TYPE_SWITCH )
      continue;

    /* Do not propagate through a link that is not healthy. */
    p_physp = osm_node_get_physp_ptr( p_node, port_num );
    if( !osm_link_is_healthy( p_physp ) )
      continue;

    remote_node_guid = osm_node_get_node_guid( p_remote_node );
    p_remote_sw = (osm_switch_t*)cl_qmap_get( p_sw_tbl, remote_node_guid );

    if( p_remote_sw == (osm_switch_t*)cl_qmap_end( p_sw_tbl ) )
    {
      /* The remote node has no entry in the switch table. */
      osm_log( p_mgr->p_log, OSM_LOG_ERROR,
               "__osm_ucast_mgr_process_neighbors: ERR 3A07: "
               "No switch object for Node GUID = 0x%" PRIx64 "\n",
               cl_ntoh64( remote_node_guid ) );
      continue;
    }

    __osm_ucast_mgr_process_neighbor( p_mgr, p_sw, p_remote_sw,
                                      (uint8_t)port_num, remote_port_num );
  }

  OSM_LOG_EXIT( p_mgr->p_log );
}

/**********************************************************************
 **********************************************************************/
/*
 * Runs one full unicast routing pass over the subnet.
 *
 * While holding p_mgr->p_lock (taken with CL_PLOCK_EXCL_ACQUIRE), this:
 *   1. applies __osm_ucast_mgr_process_hop_0 and
 *      __osm_ucast_mgr_process_leaves to every switch;
 *   2. repeatedly applies __osm_ucast_mgr_process_neighbors until no
 *      hop count changes or (number of switches - 1) passes have run;
 *   3. either lets the routing engine build the forwarding tables
 *      (ucast_build_fwd_tables returning 0) and then pushes them with
 *      __osm_ucast_mgr_set_table_cb, or falls back to the regular path
 *      (optional ucast_fdb_assign hook, then __osm_ucast_mgr_process_tbl);
 *   4. dumps the tables when OSM_LOG_ROUTING logging is active.
 *
 * p_mgr - the unicast manager to run.
 *
 * Returns OSM_SIGNAL_DONE_PENDING when the subnet has at least one
 * switch, and OSM_SIGNAL_DONE when it has none.
 */
osm_signal_t
osm_ucast_mgr_process(
  IN osm_ucast_mgr_t* const p_mgr )
{
  uint32_t i;
  uint32_t iteration_max;
  struct osm_routing_engine *p_routing_eng;
  osm_signal_t signal;
  cl_qmap_t *p_sw_guid_tbl;

  OSM_LOG_ENTER( p_mgr->p_log, osm_ucast_mgr_process );

  p_sw_guid_tbl = &p_mgr->p_subn->sw_guid_tbl;
  p_routing_eng = &p_mgr->p_subn->p_osm->routing_engine;

  CL_PLOCK_EXCL_ACQUIRE( p_mgr->p_lock );

  osm_log(p_mgr->p_log, OSM_LOG_VERBOSE,
          "osm_ucast_mgr_process: "
          "Starting switches Min Hop Table Assignment\n");

  /*
    Set the switch matrices for each switch's own port 0 LID(s)
    then set the lid matrices for each switch's leaf nodes.
  */
  cl_qmap_apply_func( p_sw_guid_tbl,
                      __osm_ucast_mgr_process_hop_0, p_mgr );

  cl_qmap_apply_func( p_sw_guid_tbl,
                      __osm_ucast_mgr_process_leaves, p_mgr );

  /*
    Get the switch matrices for each switch's neighbors.
    This process requires at most a number of iterations equal
    to the number of switches in the subnet minus 1.

    In each iteration, a switch learns the lid/port/hop
    information (as contained by a switch's lid matrix) from
    its immediate neighbors.  After each iteration, a switch
    (and its neighbors) knows more routing information than
    it did on the previous iteration.
    Thus, by repeatedly absorbing the routing information of
    neighbor switches, every switch eventually learns how to
    route all LIDs on the subnet.

    Note that there may not be any switches in the subnet if
    we are in simple p2p configuration.
  */
  iteration_max = (uint32_t)cl_qmap_count( p_sw_guid_tbl );

  /*
    If there are switches in the subnet, iterate until the lid
    matrix has been constructed.  Otherwise, just immediately
    indicate we're done if no switches exist.
  */
  if( iteration_max )
  {
    iteration_max--;

    /*
      We need to find out when the propagation of hop counts
      has relaxed.  The file-level flag __some_hop_count_set is
      cleared before each pass; if it is still FALSE after the
      pass (i.e. none of the switches had a hop count set), the
      for loop below terminates early.
    */
    __some_hop_count_set = TRUE;
    for( i = 0; (i < iteration_max) && __some_hop_count_set; i++ )
    {
      __some_hop_count_set = FALSE;
      cl_qmap_apply_func( p_sw_guid_tbl,
                          __osm_ucast_mgr_process_neighbors, p_mgr );
    }
    /* i is uint32_t, so it is printed with an unsigned conversion. */
    osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
             "osm_ucast_mgr_process: "
             "Min-hop propagated in %u steps\n",
             i
             );

    if (p_routing_eng->ucast_build_fwd_tables &&
        p_routing_eng->ucast_build_fwd_tables(p_routing_eng->context) == 0)
    {
      /*
        The routing engine built the forwarding tables itself;
        only the per-switch table-setting step remains.
      */
      cl_qmap_apply_func( p_sw_guid_tbl,
                          __osm_ucast_mgr_set_table_cb, p_mgr );
    } /* fallback on the regular path in case of failures */
    else
    {
      /*
        This is the place where we can load pre-defined routes
        into the switches fwd_tbl structures.

        Later code will use these values if not configured for
        reassignment.
      */
      if (p_routing_eng->ucast_fdb_assign)
      {
        if( osm_log_is_active( p_mgr->p_log, OSM_LOG_DEBUG ) )
        {
          osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
                   "osm_ucast_mgr_process: "
                   "Invoking \'%s\' function ucast_fdb_assign\n",
                   p_routing_eng->name );
        }

        p_routing_eng->ucast_fdb_assign(p_routing_eng->context);

      }
      else
      {
        osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
                 "osm_ucast_mgr_process: "
                 "UI pfn was not invoked\n" );
      }

      osm_log( p_mgr->p_log, OSM_LOG_INFO,
               "osm_ucast_mgr_process: "
               "Min Hop Tables configured on all switches\n" );

      /*
        Now that the lid matrices have been built, we can
        build and download the switch forwarding tables.
      */

      cl_qmap_apply_func( p_sw_guid_tbl,
                          __osm_ucast_mgr_process_tbl, p_mgr );
    }

    /* dump fdb into file: */
    if ( osm_log_is_active( p_mgr->p_log, OSM_LOG_ROUTING ) )
      __osm_ucast_mgr_dump_tables( p_mgr );

    /*
      For now don't bother checking if the switch forwarding tables
      actually needed updating.  The current code will always update
      them, and thus leave transactions pending on the wire.
      Therefore, return OSM_SIGNAL_DONE_PENDING.
    */
    signal = OSM_SIGNAL_DONE_PENDING;
  }
  else
    signal = OSM_SIGNAL_DONE;

  osm_log(p_mgr->p_log, OSM_LOG_VERBOSE,
          "osm_ucast_mgr_process: "
          "LFT Tables configured on all switches\n");

  CL_PLOCK_RELEASE( p_mgr->p_lock );
  OSM_LOG_EXIT( p_mgr->p_log );
  return( signal );
}
